library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.2
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(rstanarm)
## Warning: package 'rstanarm' was built under R version 4.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.2.2
## This is rstanarm version 2.21.3
## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
library(data.table)
## Warning: package 'data.table' was built under R version 4.2.2
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(ggsoccer)
## Warning: package 'ggsoccer' was built under R version 4.2.3
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.2.2
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(stringi)
## Warning: package 'stringi' was built under R version 4.2.2
library(rstan)
## Warning: package 'rstan' was built under R version 4.2.3
## Loading required package: StanHeaders
## Warning: package 'StanHeaders' was built under R version 4.2.3
##
## rstan version 2.26.22 (Stan version 2.26.1)
##
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
## change `threads_per_chain` option:
## rstan_options(threads_per_chain = 1)
##
## Do not specify '-march=native' in 'LOCAL_CPPFLAGS' or a Makevars file
##
## Attaching package: 'rstan'
##
## The following object is masked from 'package:tidyr':
##
## extract
library(bayesplot)
## Warning: package 'bayesplot' was built under R version 4.2.3
## This is bayesplot version 1.10.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
library(rstanarm)
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:rstanarm':
##
## compare_models, R2
##
## The following object is masked from 'package:purrr':
##
## lift
We also run the following code to make using rstan easier later on.
#Run one MCMC chain per available core, and cache compiled Stan models
#on disk so unchanged programs are not recompiled on every call.
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
We first want to load in the events data, which is split across multiple csv files, grouped by country. To read this in, we iterate through the data directory and append each file to a data frame.
#Define Data Directory
dir_path <- "../Data/events"
#Get a list of files in the directory
file_list <- list.files(dir_path)
#Read every per-country csv and bind the rows in a single step.
#rbindlist(lapply(...)) avoids the O(n^2) copying caused by growing a
#data frame with rbind() inside a loop, and removes the unsafe
#1:length() index pattern. The result is a data.table, matching what
#the original loop produced once data.table's rbind method dispatched.
actions <- rbindlist(lapply(file.path(dir_path, file_list), fread))
#View the events data frame
actions
We next want to load in the player data
#Read in data
#fromJSON() (jsonlite) parses the player records into a data frame.
#NOTE(review): path is relative to the working directory — assumes the
#script is run from its own folder; confirm when moving the notebook.
players <- fromJSON("../Data/players/players.json")
#View player data
players
Since the event data contains all sorts of event types, we now want to filter out only the shots.
# Keep only the shot events from the full event stream
shots_df <- dplyr::filter(actions, subEventName == "Shot")
The next section is dedicated to loading the event data
We now use documentation from here to mutate the data, and garner more useful categorical data about the shot itself.
#Derive categorical features from the Wyscout tag codes.
#Tag ids (per the Wyscout event documentation): 101 = goal,
#1901 = counter-attack, 401/402/403 = left foot / right foot / other
#body part, 2101 = blocked.
#Columns are referenced directly inside mutate() — the original used
#shots_df$tags, which bypasses dplyr's data masking and silently breaks
#if the frame is grouped or re-filtered mid-pipeline.
shots_df <- shots_df %>%
  mutate(
    #1 if the shot resulted in a goal
    is_goal = ifelse(grepl(" 101}", tags), 1, 0),
    #1 if the shot came at the end of a counter-attack
    is_CA = ifelse(grepl(" 1901}", tags), 1, 0),
    #Body part used. The literal string "NA" is kept deliberately to
    #preserve the original behaviour (it becomes a factor level later).
    body_part = case_when(
      grepl(" 401}", tags) ~ "left",
      grepl(" 402}", tags) ~ "right",
      grepl(" 403}", tags) ~ "body",
      TRUE ~ "NA"
    ),
    #1 if the shot was blocked
    is_blocked = ifelse(grepl(" 2101}", tags), 1, 0)
  )
#Filter out only unblocked shots
shots_df <- shots_df %>% dplyr::filter(is_blocked == 0)
#Keep necessary categorical data
shots_cat <- dplyr::select(shots_df, c('playerId', 'is_goal', 'is_CA', 'body_part'))
summary(shots_cat)
## playerId is_goal is_CA body_part
## Min. : 0 Min. :0.0000 Min. :0.00000 Length:32851
## 1st Qu.: 11066 1st Qu.:0.0000 1st Qu.:0.00000 Class :character
## Median : 25707 Median :0.0000 Median :0.00000 Mode :character
## Mean : 93637 Mean :0.1366 Mean :0.05793
## 3rd Qu.:142755 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :552555 Max. :1.0000 Max. :1.00000
We now calculate the distance from the goal and angle to goal for each shot.
First, we would like to define the position of each shot on the pitch
#Extract all numeric substrings from the positions column.
#Each element of pos is a character vector of coordinates; by the
#original indexing, element 2 is the x coordinate and element 1 the y
#coordinate of the shot location.
pos <- str_extract_all(gsub('"', "", shots_df$positions), "\\d+")
#Pull the coordinates out with vapply() instead of growing vectors with
#append() inside 1:length() loops — appending copies the whole vector
#on every iteration (O(n^2)) and 1:length() misbehaves on empty input.
#Out-of-range indexing yields NA, matching the original loop behaviour.
x_pos <- as.numeric(vapply(pos, function(p) p[2], character(1)))
y_pos <- as.numeric(vapply(pos, function(p) p[1], character(1)))
# Create coordinate dataframe
coords <- data.frame(x_pos, y_pos)
We can now use these coordinates to calculate distance and angle to goal
#Define length and width of pitch (metres)
pitch_x <- 105
pitch_y <- 68
#We now convert coordinates to meters
#Wyscout positions are percentages of pitch length/width, so scale by
#pitch size / 100.
x_meter <- coords$x_pos * pitch_x/100
y_meter <- coords$y_pos * pitch_y/100
# Calculate distances
#Euclidean distance from the shot location to the goal centre (105, 32.5).
#NOTE(review): with pitch_y = 68 the goal centre should be y = 34;
#32.5 corresponds to a 65 m wide pitch — confirm which is intended.
dist <- sqrt((105 - x_meter)^2 + ((32.5) - y_meter)^2)
#Calculate angles
#Angle (degrees) subtended by the 7.32 m goal mouth from the shot
#location. The denominator goes negative for shots very close to the
#goal, producing the negative angles that are filtered out later.
angles <- atan( (7.32 * (105 - x_meter) ) / ( (105 - x_meter)^2 + (32.5 - y_meter)^2 - (7.32/2)^2 )) * 180/pi
We can now merge our useful event data into one data-frame.
#Concatenate data-frames
shots <- data.frame(shots_cat, dist, angles)
The next section is dedicated to loading the player data. For this, we simply filter only by the features that will prove useful later on.
First lets retrieve a vector of all unique players in the current
shots data-base:
#Retrieves unique player ids
player_list <- unique(shots$playerId)
We can now filter the players data-frame to only include
these players
#Filters by player ids between both data frames
shooters <- dplyr::filter(players, wyId %in% player_list)
We can filter this data-frame by the features we need.
#Selects necessary columns
shooters <- dplyr::select(shooters, c('shortName', 'wyId', 'foot'))
Finally, we rename the some columns, for ease later on.
#Rename columns so they line up with the shots data for the later merge
shooters <- dplyr::rename(shooters, playerId = wyId, preferred_foot = foot)
We will now introduce a preferred foot binary variable.
First we merge all our useful data into one data-frame
#Concatenate data-frames
shots <- merge(shots, shooters, by = "playerId")
We now mutate this to add a column featuring the desired binary variable
#Adds a binary indicator: 1 when the shot was taken with the player's
#preferred foot. Columns are referenced directly inside mutate() — the
#original shots$preferred_foot / shots$body_part bypassed dplyr's data
#masking, which breaks under grouping and re-filtering.
shots <- shots %>%
  mutate(preferred_foot_b = ifelse(preferred_foot == body_part, 1, 0))
Finally, we remove the preferred_foot column
#Removes desired column
shots <- shots %>% dplyr::select(-c("preferred_foot"))
Since much of our data is categorical, it is necessary to convert it to the factor type.
#Convert the categorical columns to factors in one pass with across();
#behaviour matches converting each column individually with as.factor().
shots <- shots %>%
  mutate(across(c(body_part, is_CA, preferred_foot_b, shortName), as.factor))
Now lets view a summary of our data
summary(shots)
## playerId is_goal is_CA body_part dist
## Min. : 36 Min. :0.0000 0:30946 body : 6645 Min. : 0.54
## 1st Qu.: 11066 1st Qu.:0.0000 1: 1903 left :10225 1st Qu.: 11.56
## Median : 25707 Median :0.0000 right:15979 Median : 15.99
## Mean : 93642 Mean :0.1367 Mean : 17.88
## 3rd Qu.:142755 3rd Qu.:0.0000 3rd Qu.: 24.22
## Max. :552555 Max. :1.0000 Max. :103.97
##
## angles shortName preferred_foot_b
## Min. :-87.00 Cristiano Ronaldo : 155 0:12623
## 1st Qu.: 14.40 L. Insigne : 139 1:20226
## Median : 19.64 H. Kane : 137
## Mean : 23.54 E. D\\u017eeko : 121
## 3rd Qu.: 30.59 L. Messi : 121
## Max. : 89.66 I. Peri\\u0161i\\u0107: 117
## (Other) :32059
If we view a random subset of our data, we observe a problem decoding unicode characters:
shots[90:100,]
So we use the following chunk to decode them
#Unescape literal "\\uXXXX" sequences left over from the JSON export.
#Note: shortName is a factor at this point; stringi coerces it back to
#character, which is why later summaries show shortName as character.
shots$shortName <- stringi::stri_unescape_unicode(shots$shortName)
We can see from the summary there are negative angles in the data, to investigate this further we can look at a histogram
hist(shots$angles)
We observe that there are multiple negative angles. Since most of the angles are correctly positive, we will remove the negative ones from the analysis.
#Keep only shots with a positive angle; negative angles are artefacts of
#the angle formula (denominator < 0 very close to the goal line).
#The column is referenced directly — filter(shots$angles > 0) happens to
#work ungrouped but bypasses dplyr's data masking.
shots <- shots %>% dplyr::filter(angles > 0)
To see the corrected histogram
hist(shots$angles)
Later on we will use playerId to group the data. Since
our data-set is large and spans many countries, there are many different
players in the data-set
length(table(shots$playerId))
## [1] 2292
We see there are 2292 unique players included in the data-set.
With this in hand, it would be sensible to limit the amount of “groups” (players) to, say 50. In order to preserve the greatest amount of data, we will use the top 50 most occurring player names.
top_players <- sort(table(shots$playerId), decreasing = T)[1:50]
Now we filter the data based on these players
#Keep only shots taken by the 50 most frequent shooters.
#names() is the natural accessor for a 1-d table (row.names() also works).
top_shots <- dplyr::filter(shots, playerId %in% names(top_players))
In some of our later models, it is necessary to number the players from 1 to 50. This step is carried out below.
#Stan indexes groups 1..50, so map each player name to an integer id
#(alphabetical order of shortName via the default factor coding).
top_shots$bayes_id <- as.numeric(as.factor(top_shots$shortName))
We can now view a summary of our final data
summary(top_shots)
## playerId is_goal is_CA body_part dist
## Min. : 122 Min. :0.0000 0:3716 body : 756 Min. : 3.679
## 1st Qu.: 7905 1st Qu.:0.0000 1: 310 left :1329 1st Qu.:10.974
## Median : 14945 Median :0.0000 right:1941 Median :14.749
## Mean : 53939 Mean :0.1806 Mean :16.472
## 3rd Qu.: 25716 3rd Qu.:0.0000 3rd Qu.:20.832
## Max. :447804 Max. :1.0000 Max. :98.898
## angles shortName preferred_foot_b bayes_id
## Min. : 2.041 Length:4026 0:1539 Min. : 1.0
## 1st Qu.:15.430 Class :character 1:2487 1st Qu.:12.0
## Median :22.007 Mode :character Median :24.0
## Mean :25.839 Mean :24.7
## 3rd Qu.:32.098 3rd Qu.:36.0
## Max. :89.660 Max. :50.0
To better understand our distance and angle data, we can create the following boxplots.
#Defines and distance boxplot
dist_boxplot <- ggplot(top_shots, aes(x=is_goal, y=dist, fill = as.factor(is_goal))) +
geom_boxplot() +
labs(title="Distributions Of Distances Grouped By Shot Outcome",
x="Shot Outcome",
y="Distance To Goal (m)") +
coord_flip()
dist_boxplot <- dist_boxplot + guides(fill=guide_legend(title="Goal (1) or Not (0)"))
#Defines angles boxplot
angles_boxplot <- ggplot(top_shots, aes(x=is_goal, y=angles, fill = as.factor(is_goal))) +
geom_boxplot() +
labs(title="Distributions Of Angles Grouped By Shot Outcome",
x="Shot Outcome",
y="Angle To Goal (Degrees)") +
coord_flip()
angles_boxplot <- angles_boxplot + guides(fill=guide_legend(title="Goal (1) or Not (0)"))
#Plots distance boxplot
dist_boxplot
#Plots angles boxplot
angles_boxplot
From these we observe that there is likely some relationship between the
outcome of a shot and the position from which it is taken.
Next, in order to motivate prior elicitation, we consider the distribution of our most important parameters.
#Histogram of shot distances for the top-50 shooters
#(geom_histogram's default 30 bins — hence the console note).
dist_dist <- ggplot(top_shots, aes(x=dist)) +
geom_histogram(fill = "#00BFC4") +
labs(title="Distributions Of Distances",
x="Distance (m)",
y="Volume")
dist_dist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Histogram of shot angles (default 30 bins)
angle_dist <- ggplot(top_shots, aes(x=angles)) +
geom_histogram(fill = "#00bfc4") +
labs(title="Distributions Of Angles",
x="Angle (Degrees)",
y="Volume")
angle_dist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We now visualise the frequency at which each player takes a shot, and its corresponding success
Firstly, we need to wrangle the data a bit more:
#Build a per-player summary of shot volume and goals scored.
#Turn the top_players table into a data frame
top_players_df <- data.frame(top_players)
#Give the automatic table columns meaningful names
top_players_df <- dplyr::rename(top_players_df, playerId = Var1, shotVolume = Freq)
#Attach each player's display name (one row per player)
top_players_df <- merge(top_players_df, distinct(top_shots[, c("playerId", "shortName")]), by = "playerId")
#Data frame with is_goal guaranteed numeric (round-trip through
#character is a no-op on numerics, and safe if it were a factor)
numeric_goals <- top_shots[, c("shortName", "is_goal")]
numeric_goals$is_goal <- as.numeric(as.character(numeric_goals$is_goal))
#Total goals scored per player
summed_goals <- numeric_goals %>%
  group_by(shortName) %>%
  summarise(goals = sum(is_goal)) %>%
  ungroup()
#Combine names, volumes and goals, sorted by shot volume (descending)
shots_goals <- merge(top_players_df, summed_goals, by = "shortName") %>%
  arrange(desc(shotVolume))
shots_goals$shortName <- as.factor(shots_goals$shortName)
Now we visualise the results
#Converts the data into a readable format for ggplot
shots_goals_long <- gather(shots_goals, key = var, value = value, shotVolume, goals)
#Creates the plot structure
shots_goals_plot <- ggplot(shots_goals_long, aes(x=reorder(shortName, -value), y = value, fill = var)) +
geom_col(position = "identity", width = 0.9) +
labs(title="Shots And Goals By Player",
x="Players",
y="Volume") +
scale_x_discrete(guide = guide_axis(angle = 60))
#Adds a legend
shots_goals_plot <- shots_goals_plot + guides(fill=guide_legend(title="Key"))
#Plot
shots_goals_plot
We can see from the plot, that there is some difference in a players ability to convert a shot. We can exploit this difference by adding another level to our models.
top_shots
Since our data-set is somewhat small, it would be wise to have an uneven split of test and train data. This is carried out in the following chunk.
# Split into test and train subsets
train.size <- 0.8 * nrow(top_shots)
train <- sample(1:nrow(top_shots), train.size)
test <- -train
shots.train <- top_shots[train, ]
shots.test <- top_shots[test, ]
is_goal.test <- top_shots$is_goal[test]
First we will fit some simple logistic regression models.
is_goal ~ dist

First we fit a model based on just distance.
#Baseline logistic regression: P(goal) as a function of distance only.
#is_goal is numeric 0/1, so binomial() treats it as a Bernoulli response.
glm1 <- glm(is_goal ~ dist, data = shots.train, family = binomial())
summary(glm1)
##
## Call:
## glm(formula = is_goal ~ dist, family = binomial(), data = shots.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0808 -0.6989 -0.5297 -0.2882 3.4795
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.196247 0.122841 1.598 0.11
## dist -0.116261 0.008589 -13.536 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3043.0 on 3219 degrees of freedom
## Residual deviance: 2805.2 on 3218 degrees of freedom
## AIC: 2809.2
##
## Number of Fisher Scoring iterations: 5
confint(glm1)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.04346935 0.43821336
## dist -0.13339988 -0.09972087
We see that our confidence intervals are quite small already.
#Make predictions on the held-out test set.
#Bug fix: predict() takes `newdata`, not `data`. With `data =` the
#argument was silently ignored, the TRAINING-set fitted values were
#returned, and the accuracy comparison recycled with a length-mismatch
#warning. With newdata the prediction vector matches is_goal.test.
probs_1 <- predict(glm1, newdata = shots.test, type = "response")
#Convert probabilities to binary predictions
predictions_1 <- ifelse(probs_1 > 0.5, 1, 0)
#Calculate accuracy
accuracy_1 <- mean(predictions_1 == is_goal.test)
#Output accuracy
accuracy_1
## [1] 0.8198758
We also achieve a test accuracy of approximately 82.0%.
is_goal ~ dist + angles

glm2 <- glm(is_goal ~ dist + angles, data = shots.train, family = binomial())
summary(glm2)
##
## Call:
## glm(formula = is_goal ~ dist + angles, family = binomial(), data = shots.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3632 -0.6670 -0.5141 -0.3287 3.1522
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.830146 0.315249 -2.633 0.008456 **
## dist -0.079280 0.013134 -6.036 1.58e-09 ***
## angles 0.017272 0.004946 3.492 0.000479 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3043.0 on 3219 degrees of freedom
## Residual deviance: 2792.9 on 3217 degrees of freedom
## AIC: 2798.9
##
## Number of Fisher Scoring iterations: 5
confint(glm2)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.446067360 -0.20995045
## dist -0.105533919 -0.05403402
## angles 0.007602709 0.02700091
Again our confidence intervals are quite small; this is beginning to imply we have sufficient data to make reasonable predictions.
#Make predictions on the held-out test set.
#Bug fix: predict() takes `newdata`, not `data`. With `data =` the
#argument was silently ignored and training-set fitted values were
#returned, causing the earlier length-recycling warning.
probs_2 <- predict(glm2, newdata = shots.test, type = "response")
#Convert probabilities to binary predictions
predictions_2 <- ifelse(probs_2 > 0.5, 1, 0)
#Calculate accuracy
accuracy_2 <- mean(predictions_2 == is_goal.test)
#Output Accuracy
accuracy_2
## [1] 0.810559
Interestingly, our predictive accuracy has decreased, but not by an overly significant amount.
is_goal ~ .

We finally fit a model with all the relevant predictors, primarily to motivate variable selection later on.
#Full model: every predictor except the identifiers (player name/id and
#the Stan group index), which would only add per-player dummy levels.
glm3 <- glm(is_goal ~ . - shortName - playerId - bayes_id, data = shots.train, family = binomial())
summary(glm3)
##
## Call:
## glm(formula = is_goal ~ . - shortName - playerId - bayes_id,
## family = binomial(), data = shots.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7775 -0.6532 -0.4838 -0.3038 3.3273
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.688160 0.348993 -4.837 1.32e-06 ***
## is_CA1 0.329594 0.174209 1.892 0.0585 .
## body_partleft 1.114883 0.163285 6.828 8.62e-12 ***
## body_partright 0.876807 0.191122 4.588 4.48e-06 ***
## dist -0.095536 0.013761 -6.943 3.85e-12 ***
## angles 0.023472 0.005191 4.521 6.14e-06 ***
## preferred_foot_b1 0.169228 0.139695 1.211 0.2257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3043 on 3219 degrees of freedom
## Residual deviance: 2710 on 3213 degrees of freedom
## AIC: 2724
##
## Number of Fisher Scoring iterations: 5
We observe from this final fit, that due to their low p-values,
angles, dist, and body_part are
the most important variables when determining the outcome of a shot.
#Bug fix: predict() takes `newdata`, not `data`. With `data =` the
#argument was silently ignored and training-set fitted values were
#returned, causing the earlier length-recycling warning.
probs_3 <- predict(glm3, newdata = shots.test, type = "response")
# Convert probabilities to binary predictions
predictions_3 <- ifelse(probs_3 > 0.5, 1, 0)
# Calculate accuracy
accuracy_3 <- mean(predictions_3 == is_goal.test)
accuracy_3
## [1] 0.8009317
Once again, interestingly our predictive accuracy has decreased again.
In this section we fit our Bayesian models.
is_goal ~ dist

We next fit our single level Bayesian models, and produce helpful figures.
#Creates a design matrix for stan (intercept + dist, so p = 2 columns)
bmod1_X <- model.matrix(is_goal ~ dist, data = shots.train)
#Creates a design matrix for predictions
bmod1_X_new <- model.matrix(is_goal ~ dist, data = shots.test)
#Defines stan list
#Element names must match the data block of ../Stan Files/dist.stan.
#NOTE(review): beta_mu = 19 looks like a value on the scale of the
#distance predictor rather than a coefficient — confirm against the
#prior declaration in dist.stan.
bmod1_list <- list(y = as.numeric(as.character(shots.train$is_goal)),
n = dim(shots.train)[1],
p=2,
X = bmod1_X,
#Predictive Inputs
n_new = dim(shots.test)[1],
X_new = bmod1_X_new,
#Prior parameters
beta_mu = 19,
beta_sigma = 10)
#Runs Stan
#4 chains, 1000 iterations (500 warmup); init = 0 starts every chain at
#zero; fixed seed for reproducibility.
bmod1 <- stan(file = "../Stan Files/dist.stan", data = bmod1_list, chains = 4, iter = 1000, init = 0, seed = 1)
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
print(bmod1, pars="beta")
## Inference for Stan model: anon_model.
## 4 chains, each with iter=1000; warmup=500; thin=1;
## post-warmup draws per chain=500, total post-warmup draws=2000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 0.19 0.01 0.13 -0.06 0.10 0.19 0.27 0.43 387 1.01
## beta[2] -0.12 0.00 0.01 -0.13 -0.12 -0.12 -0.11 -0.10 405 1.01
##
## Samples were drawn using NUTS(diag_e) at Wed Apr 19 14:33:53 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
We observe that our credible intervals have not changed much, which implies our injection of prior information is not making much difference.
We also observe a reasonable n_eff and an Rhat close to 1, which suggests our chains are converging well.
plot(bmod1, pars="beta")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
Here is a visualisation of the confidence intervals on our beta.
traceplot(bmod1, pars='beta')
We also observe that our traceplots are not varying much with each
chain, implying good convergence.
#Extract fit
#extract() is rstan::extract here (rstan masks tidyr::extract)
ext_fit_1 <- extract(bmod1)
#Calculate accuracy
#y_new holds posterior predictive draws (draws x test rows); the
#column-wise median gives a 0/1 prediction per test observation
mean(apply(ext_fit_1$y_new, 2, median) == is_goal.test)
## [1] 0.8200993
We observe a higher prediction accuracy than in the baselines, but by a marginal amount.
Further graphs used in report can be found in shinystan
#launch_shinystan(bmod1)
is_goal ~ dist + body_part

#Creates design matrix for training and testing
#Design matrices: intercept + dist + two body_part dummies (p = 4)
bmod2_X <- model.matrix(is_goal ~ dist + body_part, data = shots.train)
bmod2_X_new <- model.matrix(is_goal ~ dist + body_part, data = shots.test)
#Defines list for stan; names must match the data block of dist+body.stan
bmod2_list <- list(y = as.numeric(as.character(shots.train$is_goal)),
n = dim(shots.train)[1],
X = bmod2_X,
p = 4,
#Predictive Parameters
n_new = dim(shots.test)[1],
X_new = bmod2_X_new,
#Prior Parameters
beta_mu_dist = 19,
beta_sigma_dist = 10
)
#Runs stan (rstan defaults: 4 chains, 2000 iterations, 1000 warmup)
bmod2 <- stan(file = "../Stan Files/dist+body.stan", data = bmod2_list, chains = 4, init = 0, seed = 1)
## recompiling to avoid crashing R session
print(bmod2, pars="beta")
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] -0.26 0 0.15 -0.54 -0.36 -0.26 -0.16 0.03 1676 1
## beta[2] -0.14 0 0.01 -0.16 -0.15 -0.14 -0.14 -0.12 2111 1
## beta[3] 1.12 0 0.15 0.84 1.02 1.12 1.22 1.41 1886 1
## beta[4] 0.95 0 0.15 0.68 0.85 0.95 1.05 1.24 1831 1
##
## Samples were drawn using NUTS(diag_e) at Wed Apr 19 14:36:15 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
We again observe no significant change in the confidence intervals, and again a high n_eff and Rhat.
plot(bmod2, pars="beta")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
Here we observe the confidence intervals once again have not changed a
significant amount.
traceplot(bmod2, pars='beta')
As before our traceplots demonstrate good convergence.
#Extract posterior from stan (rstan::extract)
ext_fit_2 <- extract(bmod2)
#Calculate accuracy: the column-wise median of the posterior predictive
#draws gives a 0/1 prediction per test observation
mean(apply(ext_fit_2$y_new, 2, median) == is_goal.test)
## [1] 0.8287841
We observe a marginally higher test accuracy once again.
Further plots used in report can be found in shinystan
#launch_shinystan(bmod2)
is_goal ~ dist + body_part + angles

We finally fit a Bayesian model using the most important variables.
#Creates Design Matrices
bmod3_X <- model.matrix(is_goal ~ dist + body_part + angles, data = shots.train)
bmod3_X_new <- model.matrix(is_goal ~ dist + body_part + angles, data = shots.test)
#Defines list for stan
bmod3_list <- list(y = as.numeric(as.character(shots.train$is_goal)),
n = dim(shots.train)[1],
X = bmod3_X,
p = 5,
#Predictive Parameters
n_new = dim(shots.test)[1],
X_new = bmod3_X_new,
#Prior Parameters
beta_mu_dist = 19,
beta_sigma_dist = 10,
beta_mu_angle = 30,
beta_sigma_angle = 10
)
#Runs Stan
bmod3 <- stan(file = "../Stan Files/dist+angles+body.stan", data = bmod3_list, chains = 4, init = 0, seed = 1)
print(bmod3, pars="beta")
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] -1.71 0.01 0.34 -2.39 -1.94 -1.71 -1.48 -1.04 1273 1
## beta[2] -0.09 0.00 0.01 -0.12 -0.10 -0.09 -0.08 -0.07 1462 1
## beta[3] 1.21 0.00 0.15 0.93 1.12 1.21 1.31 1.50 1574 1
## beta[4] 1.04 0.00 0.14 0.76 0.95 1.04 1.14 1.33 1637 1
## beta[5] 0.02 0.00 0.01 0.01 0.02 0.02 0.03 0.03 1460 1
##
## Samples were drawn using NUTS(diag_e) at Wed Apr 19 14:37:43 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
Once again, it appears our confidence intervals have not changed by any significant amount. Further, our n_eff and Rhat statistics are showing good signs of convergence.
plot(bmod3, pars="beta")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
traceplot(bmod3, pars='beta')
We again see good convergence properties in our traceplots.
#Extract fit (rstan::extract)
ext_fit_3 <- extract(bmod3)
#Calculate accuracy via the median posterior predictive draw per test row
mean(apply(ext_fit_3$y_new, 2, median) == is_goal.test)
## [1] 0.8300248
We observe a once again a slightly higher predictive accuracy.
Further plots used in report can be found in shinystan.
#launch_shinystan(bmod3)
In the final section, we fit our multi-level model, which aims to take into account the variation between player abilities.
is_goal ~ distance + (1 | shortName)

#Defines list for stan
#Data list for the hierarchical model; names must match the data block
#of ../Stan Files/dist+shortName.stan
bhmod1_list <- list(y = as.numeric(as.character(shots.train$is_goal)),
n = dim(shots.train)[1],
#Single predictor passed as a plain vector (not a design matrix)
X = shots.train$dist,
#Defines Grouping: number of players and the 1..50 group index per shot
players = length(unique(shots.train$bayes_id)),
player = shots.train$bayes_id,
#Predictive Inputs
n_new = dim(shots.test)[1],
X_new = shots.test$dist,
#Hyper-Prior Parameters
#alpha_mu_mu = 30,
#alpha_mu_sigma = 10,
#alpha_sigma_rate = 10,
beta_mu_mu = 19,
beta_mu_sigma = 10,
beta_sigma_rate = 10
)
#Runs stan
bhmod1 <- stan(file = "../Stan Files/dist+shortName.stan", data = bhmod1_list, chains = 4, init = 0, seed = 1)
## recompiling to avoid crashing R session
## Warning: There were 11 divergent transitions after warmup. See
## https://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#tail-ess
print(bhmod1, pars="beta")
## Inference for Stan model: anon_model.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] -0.13 0 0.02 -0.18 -0.15 -0.13 -0.12 -0.10 861 1.01
## beta[2] -0.11 0 0.02 -0.15 -0.12 -0.11 -0.10 -0.08 1309 1.00
## beta[3] -0.14 0 0.02 -0.19 -0.16 -0.14 -0.13 -0.11 621 1.01
## beta[4] -0.11 0 0.02 -0.14 -0.12 -0.11 -0.10 -0.08 1026 1.00
## beta[5] -0.12 0 0.02 -0.15 -0.13 -0.12 -0.11 -0.09 1153 1.00
## beta[6] -0.13 0 0.02 -0.18 -0.15 -0.13 -0.12 -0.10 682 1.01
## beta[7] -0.11 0 0.02 -0.14 -0.12 -0.11 -0.10 -0.08 1224 1.00
## beta[8] -0.14 0 0.02 -0.17 -0.15 -0.13 -0.12 -0.10 729 1.01
## beta[9] -0.12 0 0.02 -0.16 -0.13 -0.12 -0.11 -0.09 1055 1.00
## beta[10] -0.11 0 0.02 -0.15 -0.12 -0.11 -0.10 -0.08 1162 1.00
## beta[11] -0.14 0 0.02 -0.18 -0.15 -0.14 -0.13 -0.11 569 1.01
## beta[12] -0.14 0 0.02 -0.18 -0.15 -0.14 -0.12 -0.10 603 1.01
## beta[13] -0.13 0 0.02 -0.17 -0.14 -0.13 -0.12 -0.10 935 1.00
## beta[14] -0.13 0 0.02 -0.17 -0.14 -0.13 -0.12 -0.10 964 1.00
## beta[15] -0.12 0 0.02 -0.15 -0.13 -0.12 -0.11 -0.09 898 1.01
## beta[16] -0.11 0 0.02 -0.15 -0.13 -0.11 -0.10 -0.08 1193 1.00
## beta[17] -0.12 0 0.02 -0.15 -0.13 -0.12 -0.10 -0.08 1283 1.00
## beta[18] -0.14 0 0.02 -0.18 -0.15 -0.14 -0.12 -0.10 674 1.00
## beta[19] -0.13 0 0.02 -0.17 -0.14 -0.13 -0.12 -0.10 850 1.01
## beta[20] -0.12 0 0.02 -0.16 -0.13 -0.12 -0.11 -0.09 908 1.01
## beta[21] -0.11 0 0.02 -0.15 -0.12 -0.11 -0.10 -0.08 887 1.00
## beta[22] -0.13 0 0.02 -0.16 -0.14 -0.13 -0.12 -0.10 934 1.00
## beta[23] -0.11 0 0.02 -0.15 -0.13 -0.11 -0.10 -0.08 1260 1.00
## beta[24] -0.12 0 0.02 -0.15 -0.13 -0.12 -0.11 -0.09 1461 1.00
## beta[25] -0.11 0 0.02 -0.15 -0.12 -0.11 -0.10 -0.08 1434 1.00
## beta[26] -0.12 0 0.02 -0.16 -0.14 -0.12 -0.11 -0.09 1024 1.00
## beta[27] -0.15 0 0.02 -0.19 -0.16 -0.15 -0.14 -0.12 538 1.01
## beta[28] -0.11 0 0.02 -0.14 -0.12 -0.11 -0.10 -0.08 1047 1.00
## beta[29] -0.14 0 0.02 -0.19 -0.16 -0.14 -0.13 -0.10 650 1.01
## beta[30] -0.12 0 0.02 -0.15 -0.13 -0.12 -0.10 -0.08 1054 1.00
## beta[31] -0.12 0 0.02 -0.15 -0.13 -0.12 -0.10 -0.08 1234 1.00
## beta[32] -0.13 0 0.02 -0.17 -0.14 -0.13 -0.12 -0.10 957 1.00
## beta[33] -0.11 0 0.02 -0.14 -0.12 -0.11 -0.10 -0.07 1369 1.00
## beta[34] -0.10 0 0.02 -0.14 -0.12 -0.10 -0.09 -0.07 1162 1.00
## beta[35] -0.10 0 0.02 -0.13 -0.11 -0.10 -0.09 -0.07 832 1.00
## beta[36] -0.12 0 0.02 -0.16 -0.13 -0.12 -0.11 -0.09 1092 1.00
## beta[37] -0.11 0 0.02 -0.15 -0.13 -0.11 -0.10 -0.08 1136 1.00
## beta[38] -0.09 0 0.02 -0.13 -0.11 -0.09 -0.08 -0.06 828 1.00
## beta[39] -0.12 0 0.02 -0.16 -0.13 -0.12 -0.11 -0.09 1099 1.00
## beta[40] -0.13 0 0.02 -0.16 -0.14 -0.13 -0.11 -0.09 910 1.00
## beta[41] -0.12 0 0.02 -0.16 -0.14 -0.12 -0.11 -0.09 933 1.00
## beta[42] -0.12 0 0.02 -0.16 -0.13 -0.12 -0.11 -0.08 1068 1.00
## beta[43] -0.14 0 0.02 -0.19 -0.16 -0.14 -0.13 -0.11 626 1.01
## beta[44] -0.12 0 0.02 -0.16 -0.13 -0.12 -0.11 -0.09 1115 1.00
## beta[45] -0.12 0 0.02 -0.15 -0.13 -0.12 -0.11 -0.09 1093 1.00
## beta[46] -0.11 0 0.02 -0.15 -0.12 -0.11 -0.10 -0.08 1202 1.00
## beta[47] -0.12 0 0.02 -0.15 -0.13 -0.12 -0.10 -0.08 1190 1.00
## beta[48] -0.14 0 0.02 -0.18 -0.15 -0.14 -0.13 -0.10 753 1.01
## beta[49] -0.13 0 0.02 -0.17 -0.14 -0.13 -0.12 -0.10 1261 1.00
## beta[50] -0.14 0 0.02 -0.18 -0.15 -0.14 -0.12 -0.10 767 1.01
##
## Samples were drawn using NUTS(diag_e) at Wed Apr 19 14:40:10 2023.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
Despite the warning in the sampling, our n_eff and Rhat statistics show good convergence. So taking advice from here, we can safely ignore these divergent chains and use our model. Rhat and n_eff is also good for alpha, which can be found in shinystan.
We observe once again that our confidence intervals have not changed a great deal, however this was not the purpose of this final model. We aimed to take into account player variation, which the following plot proves we have, as discussed in the report.
#launch_shinystan(bhmod1)
We next visualise a prediction of the model, to emphasise how this new model now considers player abilities.
#Extracts generated quantities (rstan::extract)
ext_fit_4 <- extract(bhmod1)
#Constructs a data-frame comparing the population-level posterior
#predictive draws with Mohamed Salah's player-specific draws
salah_pp <- data.frame(Population = ext_fit_4$pp_y_new, MohamedSalah = ext_fit_4$salah_y_new)
#Makes the data-frame readable to ggplot (wide -> long; melt with no id
#variables emits the message shown below)
salah_pp <- reshape2::melt(salah_pp)
## No id variables; using all as measure variables
#Density plot comparing the two predictive distributions, built as a
#single pipeline with the legend title attached in the same expression
salah_pp_plot <- ggplot(salah_pp, aes(x = value, fill = variable)) +
  geom_density(alpha = .25) +
  labs(title = "xG Plot Of The Population And Mohamed Salah",
       x = "xG",
       y = "Likelihood") +
  guides(fill = guide_legend(title = "Key"))
#Render the plot
salah_pp_plot